In [76]:
plotly.offline.init_notebook_mode()

import pandas as pd
import numpy as np
import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb

from sklearn.metrics import roc_auc_score
In [2]:
df = pd.read_csv("/Users/tyler/Portfolio/creditcard.csv")
In [3]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     284807 non-null  float64
 22  V22     284807 non-null  float64
 23  V23     284807 non-null  float64
 24  V24     284807 non-null  float64
 25  V25     284807 non-null  float64
 26  V26     284807 non-null  float64
 27  V27     284807 non-null  float64
 28  V28     284807 non-null  float64
 29  Amount  284807 non-null  float64
 30  Class   284807 non-null  int64  
dtypes: float64(30), int64(1)
memory usage: 67.4 MB
In [4]:
df.columns
Out[4]:
Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class'],
      dtype='object')
In [5]:
# See how the data looks
df.sample(5)
Out[5]:
Time V1 V2 V3 V4 V5 V6 V7 V8 V9 ... V21 V22 V23 V24 V25 V26 V27 V28 Amount Class
181360 124943.0 0.009787 0.169569 0.059393 -0.266553 1.046478 2.039161 0.135165 -0.119449 -0.950750 ... 0.106270 1.219889 -0.246860 -0.927909 -0.830785 0.312845 -0.551278 -0.252516 7.81 0
139112 83019.0 1.044653 -0.066710 0.508579 0.462502 -0.412017 -0.304318 -0.044470 0.050467 -0.221082 ... -0.131636 -0.529180 0.129618 0.240069 0.041235 0.192747 -0.036193 0.016757 64.01 0
162714 115313.0 -1.152682 0.352632 2.698813 4.415090 0.185743 1.353093 -0.510100 0.666712 -1.517871 ... 0.150948 0.166889 0.081464 -0.467486 -0.018890 0.424690 0.321357 0.172998 98.31 0
34598 37744.0 -1.204590 0.922388 1.160011 0.251146 1.026198 -0.932686 0.696555 0.064930 -0.425765 ... 0.041679 -0.074861 -0.333049 -0.087232 0.575269 -0.385513 -0.117246 0.165928 1.00 0
112216 72564.0 -2.467981 0.751884 0.458208 -0.328720 -0.026359 -1.309760 -0.052897 0.663812 -0.843688 ... 0.219117 0.316258 -0.484254 0.620798 -0.027727 0.135159 -0.267201 -0.063993 12.59 0

5 rows × 31 columns

In [6]:
# Check NA
print(df.isnull().sum())
Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64
In [7]:
df_Class_count = pd.DataFrame(df["Class"].value_counts()).reset_index()
df_Class_count.columns = ["Class", "Freq"]

fig = px.pie(df_Class_count, values="Freq", names="Class", 
             title="Not Fraud(0) vs Fraud(1)",
             hole=0.4,
             width=800,
             height=600)
fig.update_traces(textposition='outside', textinfo='percent+label')
fig.show()
print("Data is highly imbalanced with respect with target variable 'Class',\
\nsince 0.17% of transaction is fraud in total.")
Data is highly imbalanced with respect with target variable 'Class',
since 0.17% of transaction is fraud in total.
In [18]:
X.columns
Out[18]:
Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount'],
      dtype='object')
In [8]:
X = df.iloc[:,0:30]
Y = df.iloc[:,-1]
Random_State = 2023
In [9]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=Random_State)

X_train, X_valid, Y_train, Y_valid  = train_test_split(X_train, Y_train, test_size=0.25, random_state=Random_State)

# 60/20/20 split
In [12]:
rand_clf = RandomForestClassifier(random_state=Random_State)
In [13]:
rand_clf.fit(X_train, Y_train)
Out[13]:
RandomForestClassifier(random_state=2023)
In [31]:
pred_clf = rand_clf.predict(X_valid)
In [47]:
tmp_clf = pd.DataFrame({'Feature': X.columns, 'Feature importance': rand_clf.feature_importances_})
tmp_clf = tmp_clf.sort_values(by='Feature importance',ascending=False)

fig = px.bar(tmp_clf, x="Feature", y="Feature importance", color = "Feature",
             title="Feature Importance", color_discrete_sequence=px.colors.sequential.Agsunset)
fig.show()
In [35]:
roc_auc_score(Y_valid.values, pred_clf)
Out[35]:
0.8921128951052808
In [51]:
Ada_clf = AdaBoostClassifier(n_estimators=100, random_state=Random_State)
In [52]:
Ada_clf.fit(X_train, Y_train)
Out[52]:
AdaBoostClassifier(n_estimators=100, random_state=2023)
In [57]:
tmp_Ada = pd.DataFrame({'Feature': X.columns, 'Feature importance': Ada_clf.feature_importances_})
tmp_Ada = tmp_Ada.sort_values(by='Feature importance',ascending=False)

fig = px.bar(tmp_Ada, x="Feature", y="Feature importance", color = "Feature",
             title="Feature Importance", color_discrete_sequence=px.colors.sequential.Agsunset)
fig.show()
In [58]:
pred_Ada = Ada_clf.predict(X_valid)
In [59]:
roc_auc_score(Y_valid.values, pred_Ada)
Out[59]:
0.8919458180739759
In [80]:
Cbc_clf = CatBoostClassifier(iterations = 500,
                             od_wait=100
                             ,random_seed = Random_State,)
In [82]:
Cbc_clf.fit(X_train, Y_train, verbose=False)
Out[82]:
<catboost.core.CatBoostClassifier at 0x7fd598a5b040>
In [84]:
tmp_Cbc = pd.DataFrame({'Feature': X.columns, 'Feature importance': Cbc_clf.feature_importances_})
tmp_Cbc = tmp_Cbc.sort_values(by='Feature importance',ascending=False)

fig = px.bar(tmp_Cbc, x="Feature", y="Feature importance", color = "Feature",
             title="Feature Importance", color_discrete_sequence=px.colors.sequential.Agsunset)
fig.show()
In [83]:
pred_Cbc = Cbc_clf.predict(X_valid)
In [85]:
roc_auc_score(Y_valid.values, pred_Cbc)
Out[85]:
0.8970060623616312
In [87]:
dtrain = xgb.DMatrix(X_train, Y_train.values)
dvalid = xgb.DMatrix(X_valid, Y_valid.values)
dtest = xgb.DMatrix(X_test, Y_test.values)

watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
params = {}
params['objective'] = 'binary:logistic'
params['eta'] = 0.039
params['silent'] = True
params['max_depth'] = 2
params['subsample'] = 0.8
params['colsample_bytree'] = 0.9
params['eval_metric'] = 'auc'
params['random_state'] = Random_State
In [91]:
model = xgb.train(params, 
                dtrain, 
                1000, 
                watchlist, 
                early_stopping_rounds=50, 
                maximize=True, 
                verbose_eval=50)
[01:50:42] WARNING: /Users/runner/miniforge3/conda-bld/xgboost-split_1667849653518/work/src/learner.cc:767: 
Parameters: { "silent" } are not used.

[0]	train-auc:0.86326	valid-auc:0.86749
/Users/tyler/opt/anaconda3/lib/python3.9/site-packages/xgboost/core.py:617: FutureWarning:

Pass `evals` as keyword args.

[50]	train-auc:0.92639	valid-auc:0.93594
[100]	train-auc:0.93248	valid-auc:0.95022
[150]	train-auc:0.96907	valid-auc:0.98326
[200]	train-auc:0.98785	valid-auc:0.98955
[250]	train-auc:0.99164	valid-auc:0.99156
[300]	train-auc:0.99387	valid-auc:0.99233
[350]	train-auc:0.99590	valid-auc:0.99297
[400]	train-auc:0.99723	valid-auc:0.99225
[403]	train-auc:0.99729	valid-auc:0.99212
In [108]:
feature_important = model.get_score(importance_type='gain')
keys = list(feature_important.keys())
values = list(feature_important.values())

tmp_gbm = pd.DataFrame({'Feature': keys, 'Feature importance': values}).sort_values(by='Feature importance',ascending=False)

fig = px.bar(tmp_gbm, x="Feature", y="Feature importance", color = "Feature",
             title="Feature Importance", color_discrete_sequence=px.colors.sequential.Agsunset)
fig.show()
In [93]:
pred_xgb = model.predict(dtest)
In [94]:
roc_auc_score(Y_test.values, pred_xgb)
Out[94]:
0.980369660524218
In [113]:
print("This dataset consists of one target variable and thirty explanatory variables.\
\nThe target variable 'Class' has two indicators: 0 and 1, which denote 'Not Fraud transaction' and 'Fraud transaction', respectively.\
\nWe can observe that this dataset is highly imbalanced, with only 0.17% of rows belonging to fraudulent transactions.\
\nThis could be because credit card fraud is a rare occurrence, and cardholders may not always be aware of fraudulent charges on their bill.\
\n\nTo avoid overfitting, the data was split into three sets: train, validation, and test.\
\nFour ensemble methods were implemented: Random Forest classifier, AdaBoost classifier, CatBoost classifier, and XGBoost model.\
\nAdditionally, ROC_AUC scoring was conducted, as it is a suitable method for measuring accuracy in imbalanced data.\
\n\nAlthough the XGBoost model performed with a great score of 0.98, there is still room for improvement,\
\nsuch as over-sampling, under-sampling, applying other ensemble models, etc.")
This dataset consists of one target variable and thirty explanatory variables.
The target variable 'Class' has two indicators: 0 and 1, which denote 'Not Fraud transaction' and 'Fraud transaction', respectively.
We can observe that this dataset is highly imbalanced, with only 0.17% of rows belonging to fraudulent transactions.
This could be because credit card fraud is a rare occurrence, and cardholders may not always be aware of fraudulent charges on their bill.

To avoid overfitting, the data was split into three sets: train, validation, and test.
Four ensemble methods were implemented: Random Forest classifier, AdaBoost classifier, CatBoost classifier, and XGBoost model.
Additionally, ROC_AUC scoring was conducted, as it is a suitable method for measuring accuracy in imbalanced data.

Although the XGBoost model performed with a great score of 0.98, there is still room for improvement,
such as over-sampling, under-sampling, applying other ensemble models, etc.